knitr::opts_chunk$set(echo = TRUE, message=FALSE, warning=FALSE,
                      crop=NULL, results = TRUE)
library(tidyverse)
library(DESeq2)
library(knitr)
library(ggbeeswarm)
library(kableExtra)
library(ggrastr)
library(ggpointdensity)
library(viridis)
library(ggpubr)
myTheme <- theme_bw() +
    theme(axis.text = element_text(size = 14, colour="black"),
          axis.title = element_text(size=16, colour="black"),
          axis.ticks=element_line(color="black"),
          axis.ticks.length=unit(.15, "cm"),
          panel.border=element_rect(color="black", fill = NA),
          panel.background = element_blank(),
          plot.background = element_blank(),
          legend.text = element_text(size=12),
          legend.position = "none")
projectDir <- "/Users/mariokeller/projects/HNRNPH_project/Tretow_et_al_2023"

tcgaDir <- paste0(projectDir, "/8_TCGA_analysis")

1 Background

The data so far led to the identification of CE and ALE events showing cooperatively enhanced and repressed HNRNPH regulation. As the RNA-seq experiments were performed in the human breast cancer cell line MCF7, we raised the question whether these events show a similar dependence on HNRNPH levels and direction of regulation in the TCGA BRCA cohort. To test this, expression of HNRNPH1 and HNRNPH2 and PSI levels of the enhanced and repressed CE and ALE events were computed for each BRCA tumor sample. In addition, the non-regulated CE events were also checked to see if these are also not affected in the BRCA samples.

2 Data

Vst-transformed and batch corrected expression values are loaded from a RDS-File created with Calculate_expression_BRCA.R.

Sample meta information is loaded from a RDS-File created with Calculate_expression_BRCA.R.

PSI values of junctions of interest are loaded from a RDS-File created with Calculate_PSIs_BRCA.R.

vsdBatchCorrected<- readRDS(paste0(tcgaDir,"/rds_files/BRCA_vsdBatchCorrected.rds"))

metaInformation <- readRDS(paste0(tcgaDir,"/rds_files/BRCA_metaInformation.rds"))

PSIs <- readRDS(paste0(tcgaDir,"/rds_files/BRCA_PSI.rds"))

A data.frame with the expression levels of HNRNPH1 (ENSG00000169045.17) and HNRNPH2 (ENSG00000126945.8) is created.

# Each row is a sample and columns are the samplebarcode and HNRNPH1 and HNRNPH2
#   expression levels
HNRNPHexpression <- data.frame(geneName=c("HNRNPH1", "HNRNPH2"),
                               vsdBatchCorrected[match(c("ENSG00000169045.17",
                                                         "ENSG00000126945.8"),
                                                       rownames(vsdBatchCorrected)),] %>%
                                   assay) %>%
    pivot_longer(., cols=-1, names_to="tcgaSampleBarcode", values_to="vst") %>%
    mutate(tcgaSampleBarcode=gsub(".", "-", tcgaSampleBarcode, fixed = T)) %>% 
    pivot_wider(., names_from = geneName, values_from=vst)

# The participant barcode and the sample type are added 
HNRNPHexpression <- HNRNPHexpression %>% 
    left_join(., metaInformation %>% dplyr::select(tcga.tcga_barcode,
                                                   tcga.gdc_cases.submitter_id,
                                                   tcga.gdc_cases.samples.sample_type),
              by=c("tcgaSampleBarcode" = "tcga.tcga_barcode")) %>%
    dplyr::rename(tcgaPatientBarcode = tcga.gdc_cases.submitter_id,
           sampleType = tcga.gdc_cases.samples.sample_type)

Splicing events are reduced to those of the category “Coop-Enh”, “Coop-Rep” and “nonReg”. Note that Coop-Enh and Coop-Rep comprise CE and ALE events.

# Remove the quantifications od the HNRNPH1 NMD Isoform and 
#   use for the non-regulated CE events "nonReg" as Hill category
PSIs <- PSIs %>% 
    dplyr::filter(eventType != "HNRNPH1nmdIsoform") %>%
    dplyr::mutate(hillCat = ifelse(is.na(hillCat), "nonReg", hillCat))

# Filter for the three Hill categories of interest
PSIs <- PSIs %>% dplyr::filter(hillCat %in% c("Coop-Enh", "Coop-Rep", "nonReg"))

3 HNRNPH1 and HNRNPH2 levels in normal and tumor samples

To check if there is a difference in HNRNPH1/2 levels between normal and tumor samples, I plot the respective expression levels for the 108 patients with available normal and tumor quantifications.

HNRNPHexpression %>% group_by(tcgaPatientBarcode) %>% filter(n() ==2) %>% ungroup %>% arrange(tcgaPatientBarcode, sampleType) %>%
    pivot_longer(., cols=c("HNRNPH1", "HNRNPH2"), names_to="gene", values_to="vst") %>%
    mutate(., sampleType = factor(sampleType, levels=c("Solid Tissue Normal", "Primary Tumor"), labels = c("Normal", "Tumor"))) %>%
    ggplot(., aes(x=sampleType, y=vst, fill=sampleType)) +
    geom_boxplot(width=.2, alpha=1, position = position_nudge(x = c(-.15, .15)), outlier.size = -1) +
    geom_point(pch=21, size=3, alpha=0.5) +
    geom_path(mapping=aes(group=tcgaPatientBarcode), size=1, alpha=0.5) +
    scale_y_continuous(breaks=1:15) +
    labs(x = "Sample type", y="Expression (vst)") +
    stat_compare_means(paired=TRUE) +
    facet_wrap(~gene, ncol=2, scales = "free_y") +
    myTheme +
    theme(aspect.ratio = 2/1)

4 Correlation HNRNPH1 and HNRNPH2

Next, the correlation of HNRNPH1 and HNRNPH2 levels was checked for tumor samples.

ggplot(HNRNPHexpression %>% dplyr::filter(sampleType == "Primary Tumor"), aes(x=HNRNPH1, y=HNRNPH2)) +
    rasterise(geom_pointdensity(size=3), dpi=300) +
    geom_smooth(method="lm", se=FALSE) +
    scale_x_continuous(breaks=1:16) +
    scale_y_continuous(breaks=1:16) +
    scale_color_viridis() +
    stat_cor() +
    labs(x="HNRNPH1 expression (vst)", y="HNRNPH2 expression (vst)") +
    myTheme +
    theme(aspect.ratio=1)

5 Correlations between HNRNPH1/2 and Enhanced, Repressed and Ctrl events

The PSI values cooperatively enhanced and repressed CE and ALE events as well as the non-regulated CE events are correlated with the expression of HNRNPH1 and HNRNPH2.

Correlations were performed for an event if at least 10 tumor samples had a PSI value for the event. Correlations were considered significant if their adjusted P value (BH correction) was \(\le\) 0.05. Boxplots are are based on significant and non-signififcant correlations. Significance is shown by the transparency (alpha) of the points.

corRes <- lapply(PSIs$eventID %>% unique, function(tmpEventID){
    
    # Combine HNRNPH expression levels and PSI values in a single data.frame
    df <- inner_join(HNRNPHexpression,
                         PSIs %>%
                             dplyr::filter(eventID == tmpEventID) %>%
                             dplyr::select(tcgaPatientBarcode, tcgaSampleBarcode, sampleType, PSI),
                     by = c("tcgaSampleBarcode", "tcgaPatientBarcode", "sampleType")) %>%
        dplyr::filter(sampleType=="Primary Tumor")
        
    # If there are less than 10 samples return an empty data.frame
    if(nrow(df) < 10){
        return(data.frame())
    }
    
    # Create the output data.frame
    df <- data.frame(gene=c("HNRNPH1", "HNRNPH2"),
                     eventID=tmpEventID,
                     hillCat= PSIs %>% dplyr::filter(eventID == tmpEventID) %>%
                         pull(hillCat) %>% unique,
                     r=c(cor.test(df$HNRNPH1,
                                  df$PSI)$estimate,
                         cor.test(df$HNRNPH2,
                                  df$PSI)$estimate),
                     pval=c(cor.test(df$HNRNPH1,
                                     df$PSI)$p.value,
                            cor.test(df$HNRNPH2,
                                     df$PSI)$p.value))
    return(df)
}) %>% bind_rows()

# Do the FDR corecction
corRes <- corRes %>% mutate(pvalAdj = p.adjust(pval, method = "BH")) %>%
    mutate(sign=ifelse(pvalAdj <= 0.05, TRUE, FALSE))

corRes$hillCat <- factor(corRes$hillCat, levels=c("Coop-Enh","nonReg", "Coop-Rep"), labels=c("Enhanced", "Ctrl", "Repressed"))

ggplot(mapping=aes(x=gene, y=r, col=hillCat, fill=hillCat, alpha=sign)) +
        geom_quasirandom(data=corRes %>% dplyr::filter(sign), dodge.width = .8, size=1.25) +
        geom_quasirandom(data=corRes %>% dplyr::filter(!sign), dodge.width = .8, size=1.25) +
        geom_boxplot(data=corRes, alpha=.5, outlier.size = -1, col="black",
                     position = position_dodge(width = .8), width = .7) +
        scale_fill_manual(values=c("Enhanced"="cornflowerblue", "Repressed"="salmon2", "Ctrl"="#606060")) +
        scale_color_manual(values=c("Enhanced"="cornflowerblue", "Repressed"="salmon2", "Ctrl"="#606060")) +
        scale_alpha_manual(values=c("TRUE" = 1, "FALSE" = .2)) +
        coord_cartesian(ylim=c(-1, 1)) +
        scale_y_continuous(breaks=seq(-1, 1, by=0.5)) +
        labs(x="", y="Correlation coefficnt") +
        myTheme + theme(legend.position = "right", aspect.ratio=1)

At least for HNRNPH2 we can see the expected regulation, namely that enhanced CE and ALE events are positively correlated with HNRNPH2 levels and repressed CE and ALE events negatively. The non-regulated events have a median correlation coefficient of around 0.

6 Correlation examples

6.1 IL17RC - Repressed

id <- "ENSG00000163702.20_7_CE_1"

plotDF <- inner_join(HNRNPHexpression,
                     PSIs %>% dplyr::filter(eventID == id) %>% dplyr::select(tcgaPatientBarcode, tcgaSampleBarcode, sampleType, PSI)
                     ) %>% dplyr::filter(sampleType=="Primary Tumor") 

ggplot(plotDF, aes(x=HNRNPH1, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0,1)) +
        scale_color_viridis() +
        stat_cor(label.y=1) +
        labs(x="HNRNPH1 expression (vst)", y="IL17RC exon 18 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

ggplot(plotDF, aes(x=HNRNPH2, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0,1)) +
        scale_color_viridis() +
        stat_cor(label.y=1) +
        labs(x="HNRNPH2 expression (vst)", y="IL17RC exon 18 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

6.2 NFE2L1 - Enhanced

id <- "ENSG00000082641.16_4_CE_1"

plotDF <- inner_join(HNRNPHexpression,
                     PSIs %>% dplyr::filter(eventID == id) %>% dplyr::select(tcgaPatientBarcode, tcgaSampleBarcode, sampleType, PSI)
                     ) %>% dplyr::filter(sampleType=="Primary Tumor") 

ggplot(plotDF, aes(x=HNRNPH1, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0,.8)) +
        scale_color_viridis() +
        stat_cor(label.y=.8) +
        labs(x="HNRNPH1 expression (vst)", y="NFE2L1 exon 5 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

ggplot(plotDF, aes(x=HNRNPH2, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0,.8)) +
        scale_color_viridis() +
        stat_cor(label.y=.8) +
        labs(x="HNRNPH2 expression (vst)", y="NFE2L1 exon 5 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

6.3 MYL6 - Repressed

id <- "ENSG00000092841.19_2_CE_1"

plotDF <- inner_join(HNRNPHexpression,
                     PSIs %>% dplyr::filter(eventID == id) %>% dplyr::select(tcgaPatientBarcode, tcgaSampleBarcode, sampleType, PSI)
                     ) %>% dplyr::filter(sampleType=="Primary Tumor") 

ggplot(plotDF, aes(x=HNRNPH1, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0,1)) +
        scale_color_viridis() +
        stat_cor(label.y=1) +
        labs(x="HNRNPH1 expression (vst)", y="MYL6 exon 6 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

ggplot(plotDF, aes(x=HNRNPH2, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0,1)) +
        scale_color_viridis() +
        stat_cor(label.y=1) +
        labs(x="HNRNPH2 expression (vst)", y="MYL6 exon 6 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

6.4 PIP4K2C - Enhanced

id <- "ENSG00000166908.18_1_CE_1"

plotDF <- inner_join(HNRNPHexpression,
                     PSIs %>% dplyr::filter(eventID == id) %>% dplyr::select(tcgaPatientBarcode, tcgaSampleBarcode, sampleType, PSI)
                     ) %>% dplyr::filter(sampleType=="Primary Tumor") 

ggplot(plotDF, aes(x=HNRNPH1, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0.6,1)) +
        scale_color_viridis() +
        stat_cor(label.y=.6) +
        labs(x="HNRNPH1 expression (vst)", y="PIP4K2C exon 5 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

ggplot(plotDF, aes(x=HNRNPH2, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0.6,1)) +
        scale_color_viridis() +
        stat_cor(label.y=.6) +
        labs(x="HNRNPH2 expression (vst)", y="PIP4K2C exon 5 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

6.5 MST1R - Repressed

id <- "ENSG00000164078.14_1_CE_1"

plotDF <- inner_join(HNRNPHexpression,
                     PSIs %>% dplyr::filter(eventID == id) %>% dplyr::select(tcgaPatientBarcode, tcgaSampleBarcode, sampleType, PSI)
                     ) %>% dplyr::filter(sampleType=="Primary Tumor") 

ggplot(plotDF, aes(x=HNRNPH1, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0,1)) +
        scale_color_viridis() +
        stat_cor(label.y=0) +
        labs(x="HNRNPH1 expression (vst)", y="MST1R exon 11 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

ggplot(plotDF, aes(x=HNRNPH2, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0,1)) +
        scale_color_viridis() +
        stat_cor(label.y=0) +
        labs(x="HNRNPH2 expression (vst)", y="MST1R exon 11 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

6.6 AKR1A1 - Enhanced

id <- "ENSG00000117448.14_3_CE_1"

plotDF <- inner_join(HNRNPHexpression,
                     PSIs %>% dplyr::filter(eventID == id) %>% dplyr::select(tcgaPatientBarcode, tcgaSampleBarcode, sampleType, PSI)
                     ) %>% dplyr::filter(sampleType=="Primary Tumor") 

ggplot(plotDF, aes(x=HNRNPH1, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0.4,1)) +
        scale_color_viridis() +
        stat_cor(label.y=0.4) +
        labs(x="HNRNPH1 expression (vst)", y="AKR1A1 exon 7 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

ggplot(plotDF, aes(x=HNRNPH2, y=PSI)) +
        rasterise(geom_pointdensity(size=3), dpi=300) +
        geom_smooth(method="lm", se=FALSE) +
        scale_x_continuous(breaks=1:16) +
        coord_cartesian(ylim=c(0.4,1)) +
        scale_color_viridis() +
        stat_cor(label.y=0.4) +
        labs(x="HNRNPH2 expression (vst)", y="AKR1A1 exon 7 (PSI)") +
        myTheme +
        theme(aspect.ratio = 1)

7 Session Information

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggpubr_0.4.0                viridis_0.6.1              
##  [3] viridisLite_0.4.0           ggpointdensity_0.1.0       
##  [5] ggrastr_0.2.3               kableExtra_1.3.4           
##  [7] ggbeeswarm_0.6.0            knitr_1.36                 
##  [9] DESeq2_1.33.4               SummarizedExperiment_1.23.4
## [11] Biobase_2.53.0              MatrixGenerics_1.5.4       
## [13] matrixStats_0.61.0          GenomicRanges_1.45.0       
## [15] GenomeInfoDb_1.29.8         IRanges_2.27.2             
## [17] S4Vectors_0.31.4            BiocGenerics_0.39.2        
## [19] forcats_0.5.1               stringr_1.4.0              
## [21] dplyr_1.0.9                 purrr_0.3.4                
## [23] readr_2.1.2                 tidyr_1.1.4                
## [25] tibble_3.1.8                ggplot2_3.4.0              
## [27] tidyverse_1.3.2             BiocStyle_2.22.0           
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.2.1        systemfonts_1.0.2     
##   [4] splines_4.1.0          BiocParallel_1.27.10   digest_0.6.28         
##   [7] htmltools_0.5.2        fansi_0.5.0            magrittr_2.0.3        
##  [10] memoise_2.0.0          googlesheets4_1.0.1    tzdb_0.1.2            
##  [13] openxlsx_4.2.4         Biostrings_2.61.2      annotate_1.71.0       
##  [16] modelr_0.1.8           svglite_2.0.0          colorspace_2.0-2      
##  [19] blob_1.2.2             rvest_1.0.2            haven_2.4.3           
##  [22] xfun_0.26              crayon_1.5.1           RCurl_1.98-1.5        
##  [25] jsonlite_1.7.2         genefilter_1.75.1      survival_3.2-13       
##  [28] glue_1.6.2             gtable_0.3.0           gargle_1.2.0          
##  [31] zlibbioc_1.39.0        XVector_0.33.0         webshot_0.5.2         
##  [34] DelayedArray_0.19.4    car_3.0-11             abind_1.4-5           
##  [37] scales_1.2.1           DBI_1.1.1              rstatix_0.7.0         
##  [40] Rcpp_1.0.7             xtable_1.8-4           foreign_0.8-81        
##  [43] bit_4.0.4              httr_1.4.2             RColorBrewer_1.1-2    
##  [46] ellipsis_0.3.2         pkgconfig_2.0.3        XML_3.99-0.8          
##  [49] farver_2.1.0           sass_0.4.5             dbplyr_2.1.1          
##  [52] locfit_1.5-9.4         utf8_1.2.2             tidyselect_1.1.1      
##  [55] labeling_0.4.2         rlang_1.0.6            AnnotationDbi_1.55.1  
##  [58] munsell_0.5.0          cellranger_1.1.0       tools_4.1.0           
##  [61] cachem_1.0.6           cli_3.6.0              generics_0.1.0        
##  [64] RSQLite_2.2.8          broom_1.0.0            evaluate_0.14         
##  [67] fastmap_1.1.0          yaml_2.2.1             bit64_4.0.5           
##  [70] fs_1.6.1               zip_2.2.0              KEGGREST_1.33.0       
##  [73] nlme_3.1-153           xml2_1.3.3             compiler_4.1.0        
##  [76] rstudioapi_0.13        beeswarm_0.4.0         curl_4.3.2            
##  [79] png_0.1-7              ggsignif_0.6.3         reprex_2.0.2          
##  [82] geneplotter_1.71.0     bslib_0.3.0            stringi_1.7.4         
##  [85] highr_0.9              lattice_0.20-45        Matrix_1.3-4          
##  [88] vctrs_0.5.1            pillar_1.8.0           lifecycle_1.0.3       
##  [91] BiocManager_1.30.16    jquerylib_0.1.4        data.table_1.14.2     
##  [94] bitops_1.0-7           R6_2.5.1               bookdown_0.24         
##  [97] gridExtra_2.3          rio_0.5.27             vipor_0.4.5           
## [100] assertthat_0.2.1       withr_2.5.0            GenomeInfoDbData_1.2.7
## [103] mgcv_1.8-37            parallel_4.1.0         hms_1.1.1             
## [106] grid_4.1.0             rmarkdown_2.11         carData_3.0-4         
## [109] googledrive_2.0.0      Cairo_1.5-12.2         lubridate_1.8.0